### IMPORTING R SCRIPTS
rm(list = ls())
source("../code/setup.R")
knitr::opts_chunk$set(comment = "", collapse = FALSE)
options(max.print="100")
source("../code/var_dict.R", encoding = "UTF-8")## Warning: `as_tibble.matrix()` requires a matrix with column names or a `.name_repair` argument. Using compatibility `.name_repair`.
## This warning is displayed once per session.
source("../code/functions.R")
### SETTING GLOBAL VARIABLES - CICLES TO HOURS:
### (X hours) * (60 minutes / 1 hour) * (1 cycle / 15 minutes)
### Resampling/threshold constants shared by every modelling chunk below.
RESAMPLE <- 15 # minutes per cycle: 1 cycle = 15 min
### Fix: the original comment said "(9hr)" but the formula computes a 24-hour
### window (24 hr * 60 min/hr * 1 cycle/15 min = 96 cycles), matching the
### printed output "MACHINE FAILURE DETECTED WITHIN: 24 hours = 96 cycles".
RUL_THRESHOLD <- 24 * 60 / RESAMPLE # (24 hr) * (60 min / 1 hr) * (1 cycle / 15 min) = 96 cycles
TTF_THRESHOLD <- RUL_THRESHOLD / 8 + 1 # minimum usable history: 13 cycles = 3.25 hr
KNEE_POINT <- RUL_THRESHOLD * 4 # degradation knee placed at 4x the detection window
source("../code/utils.R", encoding = "UTF-8")## Warning: 952 failed to parse.
source_python("../code/setup.py")
source_python("../code/GBClassifier.py")
source_python("../code/utils.py")cat("RESAMPLE RATE: 1 cycle = ", RESAMPLE, "minutes\n",
"MACHINE FAILURE DETECTED WITHIN:", RUL_THRESHOLD * RESAMPLE / 60, "hours =",
RUL_THRESHOLD, "cycles\n",
"MINIMUM DATA REQUIERED:", TTF_THRESHOLD * RESAMPLE / 60, "hours =",
TTF_THRESHOLD, "cycles")RESAMPLE RATE: 1 cycle = 15 minutes
MACHINE FAILURE DETECTED WITHIN: 24 hours = 96 cycles
MINIMUM DATA REQUIERED: 3.25 hours = 13 cycles
Data is resampled every 15 minutes.
cat("RESAMPLED MOTOR DATA:", "\n",
"[Rows - Observations]:", dim(motor)[1],
"\n[Columns - Variables]:", dim(motor)[2])RESAMPLED MOTOR DATA:
[Rows - Observations]: 384740
[Columns - Variables]: 50
cat("TRANSMUTED MINE DATA", "\n",
"[Rows - Observations]:", dim(mine)[1],
"\n[Columns - Variables]:", dim(mine)[2])TRANSMUTED MINE DATA
[Rows - Observations]: 5719
[Columns - Variables]: 15
cat("SAP DATA:", "\n",
"[Rows - Observations]:", dim(sap)[1],
"\n[Columns - Variables]:", dim(sap)[2])SAP DATA:
[Rows - Observations]: 2002
[Columns - Variables]: 10
First, we concatenate SAP data failures and join daily mine operating data. Missing variables are assessed by removing columns that have a high amount of missing data and dropping missing rows when the fraction of missing data is negligible.
### STANDARDIZE COLUMN TYPES
### Cast the ID/counter columns to integer and add a double copy of `cycle`
### (`cycle_norm`) for downstream models that expect a continuous feature.
dt <- dt %>%
# mutate_at(c("maint", "fail"), as.factor) %>%
mutate_at(c("fail_code", "maint_code", "fail_id", "maint_id",
"cycle", "RUL", "caex", "RUL_1", "RUL_2", "RUL_CLASS",
"LIFETIME"), as.integer) %>%
mutate(cycle_norm = as.double(cycle))
cat("ORIGINAL MERGED DATA:", "\n",
"[Rows - Observations]:", dim_dt_1[1],
"\n[Columns - Variables]:", dim_dt_1[2],
"\nDROP FAIL TIMESTAMPS:", "\n",
"[Rows - Observations]:", dim_dt_2[1],
"\n[Columns - Variables]:", dim_dt_2[2])ORIGINAL MERGED DATA:
[Rows - Observations]: 385930
[Columns - Variables]: 63
DROP FAIL TIMESTAMPS:
[Rows - Observations]: 361415
[Columns - Variables]: 63
################################################
dt %>%
select(colnames(dt)[colnames(dt) %in% colnames(
select(dt_part, colnames(motor)))]) %>% #, colnames(mine)))]) %>%
gg_miss_var()dt %>%
group_by(fail_SS, fail_id) %>%
summarise(LIFETIME = n()) %>%
mutate(LIFETIME = LIFETIME * RESAMPLE / 60) %>%
ggplot(aes(x = LIFETIME, y = fail_SS, fill = ..x..)) +
geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
scale_fill_viridis(name = "LIFETIME", option = "C") +
labs(title = "Duration by Failure") +
theme_ipsum() +
theme(legend.position = "none",
panel.spacing = unit(0.1, "lines"),
strip.text.x = element_text(size = 8))Picking joint bandwidth of 55.5
Top Engine failures
dt %>%
group_by(fail_SS, fail_id, caex) %>%
summarise(LIFESPAN = n()) %>%
group_by(fail_SS) %>%
arrange(LIFESPAN, .by_group = TRUE) %>%
filter(fail_SS == "engine") %>%
tail() %>%
rtable()fail_SS | fail_id | caex | LIFESPAN |
engine | 663 | 106 | 1412 |
engine | 468 | 92 | 1744 |
engine | 401 | 89 | 1829 |
engine | 688 | 106 | 2257 |
engine | 477 | 97 | 2750 |
engine | 342 | 85 | 2927 |
### Impute the numeric (double) columns by carrying values down then up within
### the data, then drop any column that still contains missing values.
num_vars <- names(which(sapply(dt, is.numeric)))
num_vars <- num_vars[num_vars %in% names(which(sapply(dt, is.double)))] # doubles only; integers handled below
dt <- dt %>%
fill(num_vars, .direction = "downup") %>%
select_if(~sum(!is.na(.)) >= nrow(dt)) # keep only fully-observed columns
### FACTOR VARIABLES
### Integer-coded categorical columns are split out of the integer set.
int_vars <- names(which(sapply(dt, is.integer)))
fct_vars <- int_vars[int_vars %in% c("maint_code", "caex", "fail_code")]
int_vars <- int_vars[!int_vars %in% fct_vars]
### NUMERICAL VARIABLES
### Recomputed (same expression as above) because select_if() may have
### dropped columns from dt in the meantime.
num_vars <- names(which(sapply(dt, is.numeric)))
num_vars <- num_vars[num_vars %in% names(which(sapply(dt, is.double)))]
num_vars <- num_vars[!num_vars %in% c(fct_vars, int_vars)]
### MISCELLANEOUS VARIABLES (everything not classified above, e.g. timestamps/labels)
misc_vars <- colnames(dt)[!colnames(dt) %in% c(num_vars, fct_vars, int_vars)]
### DROP NEAR ZERO VARIANCE VARIABLES
### caret::nearZeroVar metrics on the numeric columns; rows with nzv == TRUE
### are near-zero-variance candidates reported with their diagnostics.
zeroVar_dt <- dt %>%
select(c(num_vars)) %>%
nearZeroVar(names = T, saveMetrics = T,
foreach = T, allowParallel = T) %>%
mutate(near_zero_var = rownames(.)) %>%
filter(nzv == TRUE) %>%
transmute(near_zero_var = near_zero_var,
freq_ratio = freqRatio,
unique_pct = percentUnique)
zeroVar_dt %>% rtable()near_zero_var | freq_ratio | unique_pct |
### Draw an upper-triangle correlation plot of the numeric variables in `df`
### (MTTF excluded; `num_vars` is taken from the enclosing environment).
dt_cor <- function(df, title) {
  cor_vars <- num_vars[!num_vars %in% c("MTTF")]
  cor_mat <- cor(select(df, cor_vars))
  corrplot(cor_mat,
           title = title,
           type = "upper",
           order = "alphabet",
           hclust.method = "centroid",
           tl.cex = 1.5,
           tl.col = "black",
           tl.srt = 45,
           sig.level = 0.01,
           insig = "blank",
           na.label = "X")
}
dt %>%
filter(RUL_CLASS == 0) %>%
filter(fail_SS == "engine") %>%
select(num_vars) %>%
dt_cor(paste("\n\nNormal:\nTIME ABOVE",
RUL_THRESHOLD * RESAMPLE / 60,
"HOURS"))### ROWS
dt_heat <- dt
# filter(fail_type_chr != "struct_brake")
row_names <- pull(dt_heat, fail_SS) %>% unique()
### COLUMNS
dt_heat <- dt_heat %>%
select(num_vars, "fail_SS") %>%
group_by(fail_SS) %>%
summarise_if(is.numeric, mean, na.rm=TRUE) %>%
mutate_at(vars(num_vars), normalize) %>%
ungroup()
col_names <- colnames(dt_heat %>% select(-fail_SS))
### TRANSPOSE
dt_heat <- dt_heat %>%
group_by(fail_SS) %>%
transpose() %>%
mutate_all(as.numeric) %>%
as.matrix()
### ADD NAMES
dt_heat <- dt_heat[-1, ]
colnames(dt_heat) <- row_names
rownames(dt_heat) <- col_names
### GENERATE HEATMAP
row_dend = hclust(dist(dt_heat)) # row clustering
col_dend = hclust(dist(t(dt_heat))) # column clustering
dt_heat %>%
Heatmap(
name = "Value",
col = circlize::colorRamp2(c(0, 0.5, 1), c("blue", "white", "red")),
row_names_gp = gpar(fontsize = 6.5),
cluster_rows = color_branches(row_dend, k = 4),
cluster_columns = color_branches(col_dend, k = 3))decode_SS <- function(e) {
# decode_SS: translate an integer fail_code column (codes 1-5) back into its
# fail_SS label via the fail_tags lookup table kept in the global environment.
e %>%
mutate(
SS = case_when(
fail_code == 1 ~ unique(pull(filter(fail_tags, fail_code == 1), fail_SS)),
fail_code == 2 ~ unique(pull(filter(fail_tags, fail_code == 2), fail_SS)),
fail_code == 3 ~ unique(pull(filter(fail_tags, fail_code == 3), fail_SS)),
fail_code == 4 ~ unique(pull(filter(fail_tags, fail_code == 4), fail_SS)),
fail_code == 5 ~ unique(pull(filter(fail_tags, fail_code == 5), fail_SS))
))}### TAKE ONLY THE TOP 12 ROUTES ###
dt %>%
group_by(fail_SS, fail_id) %>%
summarise(data_freq = n()) %>%
ungroup() %>%
group_by(fail_SS) %>%
summarise(fail_type_freq = n(),
data_freq = sum(data_freq)) %>%
ggplot(aes(reorder(fail_SS, fail_type_freq), y = fail_type_freq, fill = data_freq)) +
geom_bar(stat = "identity", position = "dodge") +
labs(x = "Failed Components",
y = "Frequency",
title = "Top Failure Modes") +
theme(legend.position = "none",
axis.title.y = element_text(size = 20),
axis.text.y = element_text(size = 20),
axis.title.x = element_text(size = 25),
axis.text.x = element_text(size = 25),
plot.title = element_text(size = 35, hjust = 0.2)) +
scale_fill_gradientn(name = "",
colours = rev(brewer.pal(10, "Spectral"))) +
geom_text(aes(label = fail_type_freq), hjust = 1.1, size = 8.5) +
coord_flip()# p_var <- num_vars[!num_vars %in% c(colnames(mine))][1:1]
### Plot each variable in p_var against cycle, one figure per variable,
### coloured by failure identifier.
p_var <- c("RUL")
# Shared ggplot layers reused by both panels below.
p_gg <- function()
list(
labs(col = "Index"),
theme_bw(),
scale_color_gradientn(colours = rainbow(10)))
for (i in 1:(length(p_var))) {
# Panel 1 - scatter of the raw cycles-to-failure trajectories.
p1 <- dt %>%
# select(-cycle) %>%
ggplot(aes(x = cycle, y = pull(dt, p_var[i]),
group = fail_id, col = fail_id)) +
geom_point(alpha = 0.5) +
p_gg() +
labs(title = "Cycles to Failure",
x = "Cycles",
y = p_var[i])
# Panel 2 - same data drawn as one line per failure.
p2 <- dt %>%
# select(-cycle) %>%
ggplot(aes(x = cycle, y = pull(dt, p_var[i]),
group = fail_id, col = fail_id)) +
p_gg() +
labs(title = "Time to Failure",
x = "RUL",
y = p_var[i]) +
geom_line()
# Stack both panels vertically in one figure.
grid.arrange(p1, p2, nrow = 2, ncol = 1)}sensor_cols <- num_vars[num_vars %in% colnames(motor)]
### Feature-engineering cache: compute engineered features once via the
### Python helper add_features() (second argument is 8 -- presumably the
### rolling-window size; confirm in utils.py), then reuse the CSV on disk.
if (!file.exists("../output/dt_features.csv")) {
dt <- py$add_features(dt, as.integer(8), sensor_cols)
dt %>% write.csv("../output/dt_features.csv", row.names = FALSE)
} else {
# Cached run: reload the previously engineered feature table.
dt <- as_tibble(fread("../output/dt_features.csv", encoding = "UTF-8",
showProgress = FALSE))
}
new_features <- colnames(dt)
new_features <- c(
new_features[str_detect(new_features, "(\\w)+_SD")],
new_features[str_detect(new_features, "(\\w)+_AV")])tr <- dt %>% filter(fail_SS != "powertrain")
### ADD TEST DATA CONDITIONAL 20% TOTAL FAILURES
tr <- tr %>%
group_by(caex) %>%
mutate(
test_fail = max(fail_id) - 1,
test = if_else(fail_id == test_fail, 1, 0)) %>%
### ADDING CAEX 89 CRITICAL FAILURE
mutate(test = if_else(caex == 89 &
SS_comp == "Motor" &
LIFETIME == 1829, 1, test)) %>%
ungroup()
tr <- tr %>%
select(c(num_vars, fct_vars, misc_vars, int_vars, "test", new_features))
paste("FIRST DATE:", (tr %>% filter(test == 1) %>% head(1) %>% pull(dttm)))[1] "FIRST DATE: 2019-09-18 10:45:00"
te <- tr %>%
# filter(cycle < ran_cycle) %>%
filter(test == 1)
tr <- tr %>%
mutate(fail_id = as.integer(fail_id)) %>%
filter(test == 0)
te %>%
group_by(fail_id, fail_SS, caex, fail_code) %>%
summarise(LIFETIME = mean(LIFETIME),
FIRST_TTF = max(RUL),
LAST_TTF = min(RUL),
LAG_TTF = mean(LAG_RUL, na.rm=TRUE),
MAX_CYCLE = max(cycle),
MIN_CYCLE = min(cycle),
# CUTOFF_CYCLE = max(ran_cycle)
) %>% rtable()fail_id | fail_SS | caex | fail_code | LIFETIME | FIRST_TTF | LAST_TTF | LAG_TTF | MAX_CYCLE | MIN_CYCLE |
44 | electrical | 53 | 3 | 661 | 277 | 0 | 260 | 661 | 1 |
81 | engine | 57 | 1 | 175 | 174 | 0 | 64 | 175 | 1 |
129 | electrical | 62 | 3 | 71 | 70 | 0 | 254 | 71 | 1 |
176 | engine | 63 | 1 | 106 | 105 | 0 | 240 | 106 | 1 |
200 | engine | 64 | 1 | 449 | 65 | 0 | 32 | 449 | 1 |
228 | structure | 67 | 5 | 1071 | 687 | 0 | 915 | 1071 | 1 |
259 | hidraulics | 74 | 2 | 233 | 232 | 0 | 756 | 233 | 1 |
290 | structure | 80 | 5 | 16 | 15 | 0 | 45 | 16 | 1 |
321 | cab_attach | 83 | 4 | 722 | 338 | 0 | 205 | 722 | 1 |
353 | cab_attach | 85 | 4 | 469 | 85 | 0 | 594 | 469 | 1 |
384 | electrical | 86 | 3 | 32 | 31 | 0 | 75 | 32 | 1 |
401 | engine | 89 | 1 | 1829 | 1445 | 0 | 725 | 1829 | 1 |
409 | cab_attach | 89 | 4 | 412 | 28 | 0 | 399 | 412 | 1 |
440 | cab_attach | 90 | 4 | 1995 | 1611 | 0 | 42 | 1995 | 1 |
469 | electrical | 92 | 3 | 388 | 4 | 0 | 1744 | 388 | 1 |
495 | structure | 97 | 5 | 548 | 164 | 0 | 95 | 548 | 1 |
520 | structure | 98 | 5 | 95 | 94 | 0 | 27 | 95 | 1 |
554 | hidraulics | 99 | 2 | 1694 | 1310 | 0 | 371 | 1694 | 1 |
575 | hidraulics | 100 | 2 | 786 | 402 | 0 | 1775 | 786 | 1 |
596 | engine | 102 | 1 | 186 | 185 | 0 | 2365 | 186 | 1 |
636 | electrical | 103 | 3 | 351 | 350 | 0 | 49 | 351 | 1 |
659 | hidraulics | 105 | 2 | 2341 | 1957 | 0 | 623 | 2341 | 1 |
688 | engine | 106 | 1 | 2257 | 1873 | 0 | 935 | 2257 | 1 |
eXtreme Gradient Boosting: Multiclass
X_tr = r.tr
X_tr = X_tr
y_tr = X_tr["RUL_CLASS"]
bad_cols = ["dttm"]
for col in X_tr:
if X_tr[col].nunique() <= 1:
print(col)
bad_cols.append(col)test
cols_to_drop = ['maint_id', 'LIFETIME', 'RUL', 'RUL_1', 'RUL_2', "RUL_CLASS",
"dttm", "MTTF", "maint_code", "fail_SS", "fail_id", "fail_code",
"SS_comp"] + bad_cols
X_tr.columns = [str(col) for col in X_tr.columns]
cat_cols = ['maint_SS']
# XGBoost hyper-parameters for the 3-class RUL_CLASS model:
# multi:softprob emits a probability per class; merror is the multiclass
# error rate used for evaluation/early stopping.
xgb_params = {
'colsample_bytree': 0.8,
'learning_rate': 0.1, #0.05
'max_depth': 9,
'subsample': 1,
'objective':'multi:softprob',
'num_class':3,
'eval_metric':'merror',
'min_child_weight':6,
'gamma':1, #0.25
"n_jobs" : 8,
'n_estimators': 100, # 300
'verbose': 1000,
'early_stopping_rounds': 100,
}
ct = CategoricalTransformer(drop_original=True, cat_cols=cat_cols)
ft = FeatureTransformer()
transformers = {'ft': ft, 'ct': ct}
folds = GroupKFold(n_splits = 4)xgb_model = ClassifierModel(model_wrapper=XGBWrapper())
xgb_model.fit(X=X_tr, y=y_tr, folds=folds, params=xgb_params, preprocesser=None, transformers=transformers, eval_metric='cappa', cols_to_drop=cols_to_drop)Fold 1 started at Thu Jan 16 01:52:10 2020
[0] validation_0-merror:0.149758 validation_1-merror:0.28608 validation_0-cappa:-0.465242 validation_1-cappa:-0.07564
Multiple eval metrics have been passed: 'validation_1-cappa' will be used for early stopping.
Will train until validation_1-cappa hasn't improved in 100 rounds.
[99] validation_0-merror:0.030843 validation_1-merror:0.329706 validation_0-cappa:-0.9004 validation_1-cappa:-0.008062
Fold 2 started at Thu Jan 16 01:55:31 2020
[0] validation_0-merror:0.153264 validation_1-merror:0.29313 validation_0-cappa:-0.435045 validation_1-cappa:0.019125
Multiple eval metrics have been passed: 'validation_1-cappa' will be used for early stopping.
Will train until validation_1-cappa hasn't improved in 100 rounds.
[99] validation_0-merror:0.031689 validation_1-merror:0.306221 validation_0-cappa:-0.894728 validation_1-cappa:-0.042114
Fold 3 started at Thu Jan 16 01:58:47 2020
[0] validation_0-merror:0.169499 validation_1-merror:0.304162 validation_0-cappa:-0.348335 validation_1-cappa:0.023783
Multiple eval metrics have been passed: 'validation_1-cappa' will be used for early stopping.
Will train until validation_1-cappa hasn't improved in 100 rounds.
[99] validation_0-merror:0.033079 validation_1-merror:0.311745 validation_0-cappa:-0.902507 validation_1-cappa:-0.088396
Fold 4 started at Thu Jan 16 02:02:01 2020
[0] validation_0-merror:0.156903 validation_1-merror:0.299635 validation_0-cappa:-0.40984 validation_1-cappa:-0.018407
Multiple eval metrics have been passed: 'validation_1-cappa' will be used for early stopping.
Will train until validation_1-cappa hasn't improved in 100 rounds.
[99] validation_0-merror:0.031589 validation_1-merror:0.362664 validation_0-cappa:-0.898832 validation_1-cappa:0.0063
CV mean score on validation_0: 0.8991 +/- 0.0028 std.
CV mean score on validation_1: 0.0331 +/- 0.0365 std.
precision recall f1-score support
0 0.80 0.83 0.81 278731
1 0.17 0.11 0.13 45804
2 0.09 0.11 0.10 28359
accuracy 0.68 352894
macro avg 0.35 0.35 0.35 352894
weighted avg 0.66 0.68 0.67 352894
X_te = r.te
y_te = X_te["RUL_CLASS"]
pred_RUL = xgb_model.predict(X_te)
pred_arg = xgb_model.predict(X_te).argmax(1)alarm <- te %>%
select(fail_id, caex, fail_SS, RUL, LIFETIME, cycle) %>%
cbind(pred_RUL = py$pred_RUL) %>%
group_by(fail_id) %>%
ungroup()
red_light <-
te %>%
group_by(fail_id, caex) %>%
summarise() %>%
left_join(
alarm %>%
rename(prob_normal = pred_RUL.1) %>%
group_by(fail_id) %>%
mutate(prob_alarm = (pred_RUL.2 + 2 * pred_RUL.3)/3) %>%
filter(prob_alarm + lead(prob_alarm) + lead(prob_alarm, 2) > 0.185 * 3) %>%
select(-c("pred_RUL.2", "pred_RUL.3")) %>%
top_n(n = 1, wt = LIFETIME - cycle),
by = c("caex", "fail_id"))
red_light %>% rtable(
)fail_id | caex | fail_SS | RUL | LIFETIME | cycle | prob_normal | prob_alarm |
44 | 53 | electrical | 87 | 661 | 574 | 0.4989948 | 0.1856343 |
81 | 57 | engine | 157 | 175 | 18 | 0.6032961 | 0.1793893 |
129 | 62 | electrical | 70 | 71 | 1 | 0.4563932 | 0.2262334 |
176 | 63 | engine | 92 | 106 | 14 | 0.5240706 | 0.1895707 |
200 | 64 | engine | 65 | 449 | 2 | 0.4080988 | 0.2045117 |
228 | 67 | structure | 95 | 1071 | 976 | 0.4868736 | 0.1947064 |
259 | 74 | hidraulics | 111 | 233 | 122 | 0.6018161 | 0.1865952 |
290 | 80 | structure | 15 | 16 | 1 | 0.2392261 | 0.4815701 |
321 | 83 | cab_attach | 96 | 722 | 626 | 0.7187064 | 0.1353826 |
353 | 85 | cab_attach | 85 | 469 | 1 | 0.3030930 | 0.2571960 |
384 | 86 | electrical | 31 | 32 | 1 | 0.2534213 | 0.4327923 |
401 | 89 | NA | NA | NA | NA | NA | |
409 | 89 | cab_attach | 28 | 412 | 1 | 0.5091531 | 0.2795297 |
440 | 90 | cab_attach | 94 | 1995 | 1901 | 0.4814275 | 0.1955547 |
469 | 92 | electrical | 4 | 388 | 1 | 0.2256989 | 0.4376528 |
495 | 97 | structure | 93 | 548 | 455 | 0.5688560 | 0.1879514 |
520 | 98 | structure | 94 | 95 | 1 | 0.4075282 | 0.2251982 |
554 | 99 | hidraulics | 96 | 1694 | 1598 | 0.6927352 | 0.1343032 |
575 | 100 | hidraulics | 91 | 786 | 695 | 0.5318039 | 0.1755869 |
596 | 102 | engine | 95 | 186 | 91 | 0.4035516 | 0.2276198 |
636 | 103 | electrical | 95 | 351 | 256 | 0.5916126 | 0.1702460 |
659 | 105 | hidraulics | 96 | 2341 | 2245 | 0.7078847 | 0.1278846 |
688 | 106 | engine | 97 | 2257 | 2160 | 0.7427008 | 0.1554266 |
According to the book Data Science for Business, different classification models can be compared using the Expected Value calculation. This is achieved by constructing a cost-benefit matrix in line with the model confusion matrix, and then converting model performance into a single monetary value by multiplying the confusion matrix by the cost-benefit matrix using the formula:
Expected Profit = Probability(+ve) x [TPR x benefit(TP) + FNR x cost(FN)] + Probability(-ve) x [TNR x benefit(TN) + FPR x cost(FP)]
Cost-benefit matrix should be provided by business domain experts. For this project, the following values must be provided:
TP = nrow(red_light %>% filter(RUL < RUL_THRESHOLD + RUL_THRESHOLD/6 &
RUL > RUL_THRESHOLD - RUL_THRESHOLD/6))
TP[1] 15
TN = nrow(red_light %>% filter(RUL > RUL_THRESHOLD + RUL_THRESHOLD/6 |
RUL < RUL_THRESHOLD - RUL_THRESHOLD/6))
TN[1] 7
This could be implemented by adding false test data.
[1] 1
[1] 0.6521739
I tried to find out why my probabilities were so small and I stumbled upon this answer:
One way to look at the threshold is that when you set it to 0.1 you are implicitly specifying a loss function. That is, separating the question of what to do (e.g. approach a customer) from what to infer (e.g. that the probability is of 1 is 0.15). Indeed, you might make this separation a bit more explicit in your question. For example, you talk about needing to approach 5% of some people for something to be worthwhile. And then about how well you can predict cases. Is the issue that to approach the `right’ 5% (presumably the true ’1’s) you might have to approach many more (true ’0’s) to no effect? Then the cost of approach is relevant and the threshold should be set to minimise loss. But you also say you can predict the held out cases well when the threshold is set at 0.1…
p_alarm <- alarm %>%
rename(pred_normal = pred_RUL.1) %>%
group_by(fail_id) %>%
mutate(prob_alarm = (pred_RUL.2 + 2 * pred_RUL.3)/3) %>%
ungroup() %>%
filter(RUL < 128)
p_var <- pull(red_light %>%
filter(RUL < RUL_THRESHOLD + RUL_THRESHOLD/6 &
RUL > RUL_THRESHOLD - RUL_THRESHOLD/6), fail_id)
p_gg <- function()
list(
labs(col = "Index"),
theme_stata())
for (i in 1:(length(p_var))) {
p1 <- p_alarm %>%
filter(fail_id == p_var[i]) %>%
ggplot(aes(x = cycle * 15 / 60, y = prob_alarm)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "loess", span = 1/2) +
p_gg() +
labs(title = "Probability To Failure",
x = "Hours To Failure",
y = "Probability",
subtitle = paste(
"Failed Sub-Sistem Component:",
filter(p_alarm, fail_id == p_var[i]) %>% pull(fail_SS),
"\nFail ID:", p_var[i]))
grid.arrange(p1, nrow = 1, ncol = 1)}eXtreme Gradient Boosting: Multiclass
X_tr = r.tr
X_tr = X_tr[X_tr["RUL_1"] == 1]
y_tr = X_tr["fail_code"] - 1
bad_cols = []
for col in X_tr:
if X_tr[col].nunique() <= 1:
print(col)
bad_cols.append(col)RUL_1
test
cols_to_drop = ['maint_id', 'LIFETIME', 'RUL', 'RUL_1', 'RUL_2', "RUL_CLASS", "ran_cycle","dttm", "MTTF", "maint_code", "fail_SS", "fail_id", "fail_code", "SS_comp"] + bad_cols
X_tr.columns = [str(col) for col in X_tr.columns]
cat_cols = ['maint_SS']
xgb_params = {
'colsample_bytree': 0.8,
'learning_rate': 0.1, ### 0.05
'max_depth': 9,
'subsample': 1,
'objective':'multi:softprob',
'num_class':5,
'eval_metric':'merror',
'min_child_weight':6,
'gamma':0.25,
"n_jobs" : 8,
'n_estimators':100, ### 300
'nthread': 6,
'verbose': 1000,
'early_stopping_rounds': 100,
}
ct = CategoricalTransformer(drop_original=True, cat_cols=cat_cols)
ft = FeatureTransformer()
transformers = {'ft': ft, 'ct': ct}
folds = GroupKFold(n_splits = 6)xgb_model = ClassifierModel(model_wrapper=XGBWrapper())
xgb_model.fit(X=X_tr, y=y_tr, folds=folds, params=xgb_params, preprocesser=None, transformers=transformers,
eval_metric='cappa', cols_to_drop=cols_to_drop)Fold 1 started at Thu Jan 16 02:05:31 2020
[0] validation_0-merror:0.229919 validation_1-merror:0.786864 validation_0-cappa:-0.580573 validation_1-cappa:0.567523
Multiple eval metrics have been passed: 'validation_1-cappa' will be used for early stopping.
Will train until validation_1-cappa hasn't improved in 100 rounds.
[99] validation_0-merror:1.6e-05 validation_1-merror:0.797784 validation_0-cappa:-0.999993 validation_1-cappa:0.3749
Fold 2 started at Thu Jan 16 02:06:48 2020
[0] validation_0-merror:0.223722 validation_1-merror:0.641349 validation_0-cappa:-0.575524 validation_1-cappa:0.476323
Multiple eval metrics have been passed: 'validation_1-cappa' will be used for early stopping.
Will train until validation_1-cappa hasn't improved in 100 rounds.
[99] validation_0-merror:0.000178 validation_1-merror:0.609965 validation_0-cappa:-0.999667 validation_1-cappa:0.030558
Fold 3 started at Thu Jan 16 02:08:07 2020
[0] validation_0-merror:0.212913 validation_1-merror:0.740678 validation_0-cappa:-0.62386 validation_1-cappa:0.326893
Multiple eval metrics have been passed: 'validation_1-cappa' will be used for early stopping.
Will train until validation_1-cappa hasn't improved in 100 rounds.
[99] validation_0-merror:1.6e-05 validation_1-merror:0.732832 validation_0-cappa:-0.999993 validation_1-cappa:0.414509
Fold 4 started at Thu Jan 16 02:09:26 2020
[0] validation_0-merror:0.190526 validation_1-merror:0.633036 validation_0-cappa:-0.627567 validation_1-cappa:0.32184
Multiple eval metrics have been passed: 'validation_1-cappa' will be used for early stopping.
Will train until validation_1-cappa hasn't improved in 100 rounds.
[99] validation_0-merror:6.5e-05 validation_1-merror:0.610364 validation_0-cappa:-0.999839 validation_1-cappa:0.187801
Fold 5 started at Thu Jan 16 02:10:45 2020
[0] validation_0-merror:0.239106 validation_1-merror:0.624575 validation_0-cappa:-0.612178 validation_1-cappa:0.118006
Multiple eval metrics have been passed: 'validation_1-cappa' will be used for early stopping.
Will train until validation_1-cappa hasn't improved in 100 rounds.
[99] validation_0-merror:9.7e-05 validation_1-merror:0.682657 validation_0-cappa:-0.999846 validation_1-cappa:0.227895
Fold 6 started at Thu Jan 16 02:12:04 2020
[0] validation_0-merror:0.264996 validation_1-merror:0.689694 validation_0-cappa:-0.402547 validation_1-cappa:0.287257
Multiple eval metrics have been passed: 'validation_1-cappa' will be used for early stopping.
Will train until validation_1-cappa hasn't improved in 100 rounds.
[99] validation_0-merror:9.7e-05 validation_1-merror:0.693739 validation_0-cappa:-0.99981 validation_1-cappa:0.21288
CV mean score on validation_0: 0.9999 +/- 0.0001 std.
CV mean score on validation_1: -0.2414 +/- 0.1266 std.
precision recall f1-score support
0 0.32 0.25 0.28 15148
1 0.14 0.10 0.12 9708
2 0.29 0.22 0.25 12404
3 0.39 0.60 0.47 27321
4 0.16 0.06 0.08 9582
accuracy 0.33 74163
macro avg 0.26 0.24 0.24 74163
weighted avg 0.29 0.33 0.30 74163
X_te = r.te
X_te = X_te[X_te["RUL"] > 48]
y_te = X_te["fail_code"]
prediction = xgb_model.predict(X_te).argmax(1)te_xgb_fail <- te %>%
filter(RUL > 48) %>%
select(fail_id, caex, fail_SS, RUL_1) %>%
cbind(pred_code = py$prediction + 1,
real_code = py$y_te) %>%
filter(RUL_1 == 1) %>%
add_count(fail_id, pred_code)
te_xgb_fail %>%
group_by(fail_id) %>%
mutate(Majority = pred_code[n == max(n)][1]) %>%
select(-n) %>%
group_by(fail_id, caex, fail_SS) %>%
summarise(real_code = mean(real_code),
pred_code = mean(Majority)) %>%
mutate(fail_code = pred_code) %>%
decode_SS() %>%
rename(pred_SS = SS,
real_SS = fail_SS) %>%
select(-c("fail_code")) %>%
rtable()fail_id | caex | real_SS | real_code | pred_code | pred_SS |
44 | 53 | electrical | 3 | 3 | electrical |
81 | 57 | engine | 1 | 1 | engine |
129 | 62 | electrical | 3 | 3 | electrical |
176 | 63 | engine | 1 | 1 | engine |
200 | 64 | engine | 1 | 1 | engine |
228 | 67 | structure | 5 | 5 | structure |
259 | 74 | hidraulics | 2 | 4 | cab_attach |
321 | 83 | cab_attach | 4 | 4 | cab_attach |
353 | 85 | cab_attach | 4 | 4 | cab_attach |
401 | 89 | engine | 1 | 4 | cab_attach |
440 | 90 | cab_attach | 4 | 4 | cab_attach |
495 | 97 | structure | 5 | 5 | structure |
520 | 98 | structure | 5 | 5 | structure |
554 | 99 | hidraulics | 2 | 2 | hidraulics |
575 | 100 | hidraulics | 2 | 2 | hidraulics |
596 | 102 | engine | 1 | 1 | engine |
636 | 103 | electrical | 3 | 3 | electrical |
659 | 105 | hidraulics | 2 | 2 | hidraulics |
688 | 106 | engine | 1 | 1 | engine |
X_te = r.te
X_te = X_te[X_te["RUL"] > 16]
y_te = X_te["fail_code"]
prediction = xgb_model.predict(X_te).argmax(1)te_xgb_fail <- te %>%
filter(RUL > 16) %>%
select(fail_id, caex, fail_SS, RUL_1) %>%
cbind(pred_code = py$prediction + 1,
real_code = py$y_te) %>%
filter(RUL_1 == 1) %>%
add_count(fail_id, pred_code)
te_xgb_fail %>%
group_by(fail_id) %>%
mutate(Majority = pred_code[n == max(n)][1]) %>%
select(-n) %>%
group_by(fail_id, caex, fail_SS) %>%
summarise(real_code = mean(real_code),
pred_code = mean(Majority)) %>%
mutate(fail_code = pred_code) %>%
decode_SS() %>%
rename(pred_SS = SS,
real_SS = fail_SS) %>%
select(-c("fail_code")) %>%
rtable()fail_id | caex | real_SS | real_code | pred_code | pred_SS |
44 | 53 | electrical | 3 | 3 | electrical |
81 | 57 | engine | 1 | 1 | engine |
129 | 62 | electrical | 3 | 3 | electrical |
176 | 63 | engine | 1 | 1 | engine |
200 | 64 | engine | 1 | 1 | engine |
228 | 67 | structure | 5 | 5 | structure |
259 | 74 | hidraulics | 2 | 4 | cab_attach |
321 | 83 | cab_attach | 4 | 4 | cab_attach |
353 | 85 | cab_attach | 4 | 4 | cab_attach |
384 | 86 | electrical | 3 | 3 | electrical |
401 | 89 | engine | 1 | 4 | cab_attach |
409 | 89 | cab_attach | 4 | 4 | cab_attach |
440 | 90 | cab_attach | 4 | 4 | cab_attach |
495 | 97 | structure | 5 | 5 | structure |
520 | 98 | structure | 5 | 5 | structure |
554 | 99 | hidraulics | 2 | 2 | hidraulics |
575 | 100 | hidraulics | 2 | 2 | hidraulics |
596 | 102 | engine | 1 | 1 | engine |
636 | 103 | electrical | 3 | 3 | electrical |
659 | 105 | hidraulics | 2 | 2 | hidraulics |
688 | 106 | engine | 1 | 1 | engine |
KFold Light GBM
from sklearn.metrics import mean_squared_error
target = "RUL"
X_tr = r.tr
X_tr = X_tr[X_tr["RUL"] >= 96]
bad_cols = ["dttm"]
for col in X_tr:
if X_tr[col].nunique() <= 1:
print(col)
bad_cols.append(col)RUL_1
RUL_2
RUL_CLASS
test
cols_to_drop = ['maint_id', 'LIFETIME', 'RUL', 'RUL_1', 'RUL_2', "RUL_CLASS",
"dttm", "MTTF", "fail_SS", "fail_id", "fail_code",
"SS_comp", "maint_SS", "cycle"] + bad_cols
X_tr.columns = [str(col) for col in X_tr.columns]
cat_cols = ['maint_code']
features = [col for col in X_tr.columns if col not in cols_to_drop]
### DEFINE FOLDS ###
folds = 4
seed = 999
gkf = KFold(n_splits = folds, shuffle = True, random_state = seed)
models = []
shap_values = np.zeros(X_tr[features].shape)
shap_sampling = 125000 ### SAMPLING SHAP REDUCED COMPUTACIONAL COST ###
oof_pred = np.zeros(X_tr.shape[0]) ### OUT OF FOLDS PREDICTIONS ###for i, (tr_idx, val_idx) in enumerate(gkf.split(X_tr)):
def fit_LGBMRegressor(tr_idx, val_idx): ### MEMORY CLOSURE
# Train one LightGBM regression fold on X_tr (captured from the enclosing
# scope, as are i, features, target, cat_cols and shap_sampling) and return
# (fitted model, SHAP values of the validation slice, validation predictions).
tr_x, tr_y = X_tr[features].iloc[tr_idx], X_tr[target].iloc[tr_idx]
vl_x, vl_y = X_tr[features].iloc[val_idx], X_tr[target].iloc[val_idx]
print({"fold": i, "train size": len(tr_x), "eval size": len(vl_x)})
# NOTE(review): tr_data/vl_data are built but never used -- training below
# goes through the sklearn API (clf.fit), so these Datasets (and their
# categorical_feature setting) have no effect. Confirm and consider removing.
tr_data = lgb.Dataset(tr_x, label = tr_y,
categorical_feature = cat_cols)
vl_data = lgb.Dataset(vl_x, label = vl_y,
categorical_feature = cat_cols)
clf = lgb.LGBMRegressor(n_estimators = 250, # 900,
learning_rate = 0.1,
subsample = 0.25,
subsample_freq = 1,
n_jobs = 8,
feature_fraction = 0.9,
num_leaves = 80,
lambda_l1 = 0.5,
lambda_l2 = 0.5,
seed = i, ### SEED DIVERSIFICATION
metric = "rmse")
clf.fit(tr_x, tr_y, eval_set = [(vl_x, vl_y)],
verbose = 150)
# SHAP values on (at most) the first shap_sampling validation rows.
fold_importance = shap.TreeExplainer(clf).shap_values(vl_x[:shap_sampling])
valid_prediticion = clf.predict(vl_x, num_iteration = clf.best_iteration_)
oof_loss = np.sqrt(mean_squared_error(vl_y, valid_prediticion))
# NOTE(review): the label says RMSLE but the value computed above is RMSE
# (sqrt of MSE on raw targets, no log transform).
print(f"Fold:{i} RMSLE: {oof_loss:.4f}")
return clf, fold_importance, valid_prediticion
clf, shap_values[val_idx[:shap_sampling]], oof_pred[val_idx] = fit_LGBMRegressor(tr_idx, val_idx)
models.append(clf){'fold': 0, 'train size': 209048, 'eval size': 69683}
[150] valid_0's rmse: 83.9531
Fold:0 RMSLE: 69.8348
{'fold': 1, 'train size': 209048, 'eval size': 69683}
[150] valid_0's rmse: 81.4533
Fold:1 RMSLE: 68.5844
{'fold': 2, 'train size': 209048, 'eval size': 69683}
[150] valid_0's rmse: 80.0952
Fold:2 RMSLE: 68.5327
{'fold': 3, 'train size': 209049, 'eval size': 69682}
[150] valid_0's rmse: 80.9218
Fold:3 RMSLE: 68.5133
24
OOF RMSLE: 68.8686
Shap Summary Plot
### SHAP IMPORTANCE DATA FRAME ###
ma_shap = pd.DataFrame(sorted(zip(abs(shap_values).mean(axis=0), features), reverse = True), columns=["Mean Abs Shapley", "Feature"]).set_index("Feature")
shap.summary_plot(shap_values, X_tr[features], plot_size = None)Please note that the accuracy might be deceiving due to the fact that this model depends on the output of the multiclassification model. An overall accuracy is needed.
X_te = r.te
X_te = X_te[X_te["RUL"] >= 96]
# split test data into batches
set_size = len(X_te)
iterations = int(set_size / 5)
batch_size = set_size // iterations
print(set_size, iterations, batch_size)13515 2703 5
assert set_size == iterations * batch_size
ttf_pred = []
for i in (range(iterations)):
pos = i*batch_size
fold_preds = [(model.predict(X_te[features].iloc[pos : pos+batch_size])) for model in models]
ttf_pred.extend(np.mean(fold_preds, axis=0))
te_x, te_y = X_te[features], X_te[target]
pred = np.sqrt(mean_squared_error(X_te[target], ttf_pred))
print(f"OOF RMSLE: {pred:.4f}")OOF RMSLE: 233.9114
We proceed to visualize the RUL for the failures which might be right according to the model’s confidence.
# Render an interactive echarts line plot comparing real vs predicted RUL
# for one failure (idx = fail_id). Reads dt_te and te_xgb_fail from the
# global environment; cycles are converted to hours (* 0.25, i.e. 15 min/60).
plot_RUL <- function(idx) {
# e_pred <- paste0("Failed Sub Sistem: ",
# unique(pull(filter(te_xgb_fail, fail_id == idx), fail_SS)))
# e_real <- paste0("Fail ID: ",
# unique(pull(filter(te_xgb_fail, fail_id == idx), fail_id)))
# Select this failure and rescale cycle counts to hours.
p_RUL <- dt_te %>%
filter(fail_id == idx) %>%
mutate(cycle = cycle * 0.25,
pred_RUL = pred_RUL * 0.25,
real_RUL = real_RUL * 0.25)
p_RUL %>%
e_charts(cycle, width = "100%", height = 400) %>%
e_line("real_RUL", color = "#194f6e", name = "Real RUL") %>%
e_line("pred_RUL", color = "#b1254c",
name = "Predicted RUL") %>%
# Title/subtitle carry the fail id, truck (CAEX) and failed sub-system.
e_title(paste("RUL Prediction", "- Fail ID: ",
unique(pull(filter(te_xgb_fail, fail_id == idx), fail_id))),
paste("CAEX", unique(pull(filter(te_xgb_fail, fail_id == idx), caex)),
"- Failed Sub Sistem:",
unique(pull(filter(te_xgb_fail, fail_id == idx), fail_SS))),
textStyle=list(fontFamily="arial",
fontWeight="bold",
color="#4d4d4d"),
subtextStyle=list(fontFamily="arial", fontSize=16)) %>%
e_format_x_axis(suffix = "Hours") %>%
e_format_y_axis(suffix = "Hours") %>%
e_theme("dark") %>%
e_legend(bottom = 0, textStyle=list(fontFamily="arial", fontSize=14)) %>%
e_tooltip(trigger="axis",
axisPointer = list(
type = "cross"),
textStyle=list(fontFamily="arial", fontSize=14)) %>%
# Mark the value of each series at the last observed cycle.
e_mark_point(serie = "Real RUL",
data = as.list(transmute(filter(filter(p_RUL, fail_id == idx),
cycle == max(cycle)),
xAxis = cycle,
yAxis = real_RUL,
value = paste(real_RUL, "hours")))) %>%
e_mark_point(serie = "Predicted RUL",
data = as.list(transmute(filter(filter(p_RUL, fail_id == idx),
cycle == max(cycle)),
xAxis = cycle,
yAxis = pred_RUL,
value = paste(round(pred_RUL, 0), "hours")))) %>%
e_toolbox_feature(feature = c("saveAsImage"))}
p_var <- pull(red_light %>%
filter(RUL < RUL_THRESHOLD + RUL_THRESHOLD/6 &
RUL > RUL_THRESHOLD - RUL_THRESHOLD/6), fail_id)A work by Alonso M., Cecil V.